data description

The dataset consists of over 250,000 wine review entries. Each entry contains the country the wine came from, a description of the wine, the designation (the vineyard within the winery where the grapes are from), the number of points WineEnthusiast gave the wine on a scale of 1-100, the price for a bottle of the wine, the province or state the wine is from, the region or wine-growing area, the variety or type of grape used to make the wine, and, lastly, the winery.

First thing to explore: which countries do the best wines come from?

# Assign the expected classes, then summarise which country produces the
# best-rated wine.
wine$country <- as.factor(wine$country)
wine$points <- as.numeric(wine$points)

# Mean score per country. aggregate() returns generic column names, so they
# are renamed right away and the ranking below can refer to `score` by name.
mean_score <- aggregate(wine$points, by = list(wine$country), mean)
colnames(mean_score) <- c("country", "score")

# Keep the 10 countries with the highest mean score. The ranking column is
# named explicitly instead of letting top_n() pick the last column.
mean_score <- mean_score %>% top_n(10, score)

# Histogram of log(points) to eyeball how close the scores are to a normal
# distribution; bins = 30 is the value stat_bin() was falling back to.
ggplot(wine, aes(log(points))) +
  geom_histogram(bins = 30, color = "black", fill = "white")

# Bar chart of the countries where the best wine comes from.
# The title is set with labs(): `main` is not a ggplot2 aesthetic, so passing
# it inside aes() never produced a title.
ggplot(
  data = mean_score,
  mapping = aes(x = reorder(country, score), y = score, fill = country)
) +
  geom_bar(stat = "identity") +
  coord_flip(ylim = c(85, 92)) +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "best wine", x = "country")

# Most expensive countries: mean bottle price per country.
# aggregate() returns generic column names, so they are renamed right away and
# the ranking below can refer to `price` by name.
mean_price <- aggregate(wine$price, by = list(wine$country), mean)
colnames(mean_price) <- c("country", "price")

# Keep the 10 countries with the highest mean price. The ranking column is
# named explicitly instead of letting top_n() pick the last column.
mean_price <- mean_price %>% top_n(10, price)

# Histogram of log(price) to eyeball how close the prices are to a normal
# distribution; bins = 30 is the value stat_bin() was falling back to.
ggplot(wine, aes(log(price))) +
  geom_histogram(bins = 30, color = "black", fill = "white")

# Bar chart of the 10 countries with the most expensive wine on average.
# The title is set with labs(): `main` is not a ggplot2 aesthetic, so passing
# it inside aes() never produced a title.
ggplot(
  data = mean_price,
  mapping = aes(x = reorder(country, price), y = price, fill = country)
) +
  geom_bar(stat = "identity") +
  coord_flip(ylim = c(30, 65)) +
  scale_fill_brewer(palette = "Spectral") +
  labs(title = "most expensive wine", x = "country")

# Scatter plot of price against points, with a fitted regression line, its
# confidence band and the Pearson correlation coefficient drawn on the plot.
ggscatter(
  wine,
  x = "points", y = "price",
  color = "red", shape = 21,
  add = "reg.line", conf.int = TRUE,
  cor.coef = TRUE, cor.method = "pearson",
  xlab = "points", ylab = "price(US dollars)"
)
## `geom_smooth()` using formula 'y ~ x'

# Pearson correlation test between price and quality: the correlation is
# positive and significant, cor = 0.4374798 (~0.44), p-value < 0.05.
cor.test(x = wine$price, y = wine$points, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  wine$price and wine$points
## t = 247.18, df = 258143, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4343552 0.4405938
## sample estimates:
##       cor 
## 0.4374798

# Overall mean price, then overall mean score.
mean(wine$price)
## [1] 34.1796
mean(wine$points)
## [1] 88.08486
## Additional inspection of the data
# Best wineries: mean score per winery, keeping the 10 highest-scoring ones.
# aggregate() names its result columns `Group.1` (the winery) and `x` (the
# mean), so `x` is passed to top_n() explicitly as the ranking column.
mean_score_winery <- aggregate(wine$points, by = list(wine$winery), mean)
mean_score_winery <- mean_score_winery %>% top_n(10, x)

# Show the winery names. The previous levels(mean_score_winery) was called on
# the data frame itself, which has no levels, so it always returned NULL.
as.character(mean_score_winery$Group.1)

# Best regions: mean score per province.
# NOTE(review): 11 rows are kept here rather than 10 - confirm this is intended.
mean_score_region <- aggregate(wine$points, by = list(wine$province), mean)
mean_score_region <- mean_score_region %>% top_n(11, x)

# The 10 most frequently reviewed wine varieties, most frequent first.
count_w <- wine %>%
  count("variety") %>%
  arrange(desc(freq)) %>%
  head(10)

# Names of the most popular varieties.
wine_list <- c(
  "Pinot Noir",
  "Chardonnay",
  "Cabernet Sauvignon",
  "Red Blend",
  "Sauvignon Blanc",
  "Riesling",
  "Bordeaux-style Red Blend",
  "Syrah",
  "Merlot",
  "Zinfandel"
)

# All reviews that belong to one of the popular varieties.
pop_wine <- wine %>% filter(variety %in% wine_list)

# One data frame of reviews per popular variety.
pinot_noir      <- wine %>% filter(variety == "Pinot Noir")
chardonnay      <- wine %>% filter(variety == "Chardonnay")
cabernet        <- wine %>% filter(variety == "Cabernet Sauvignon")
red_blend       <- wine %>% filter(variety == "Red Blend")
sauvignon_blanc <- wine %>% filter(variety == "Sauvignon Blanc")
riesling        <- wine %>% filter(variety == "Riesling")
bordeaux        <- wine %>% filter(variety == "Bordeaux-style Red Blend")
syrah           <- wine %>% filter(variety == "Syrah")
merlot          <- wine %>% filter(variety == "Merlot")
zinfandel       <- wine %>% filter(variety == "Zinfandel")
                   


# Word frequencies per wine variety.
#
# The same corpus pipeline used to be copy-pasted once per variety; it now
# lives in a single helper that is applied to each variety's descriptions.
# Only the resulting df_* data frames are kept: the per-variety intermediates
# (text_*, docs_*, dtm_*, matrix_*, words_*) are no longer left in the global
# environment.

# Build a word-frequency table from a set of review texts.
#
# descriptions: character vector of review texts, one element per review.
#
# Returns a data frame with one row per term and two columns: `word` (the
# term) and `freq` (how many times it occurs across all texts), sorted by
# decreasing `freq`.
#
# Cleaning order matters: the text is lower-cased and stop words are removed
# BEFORE punctuation is stripped. Stripping punctuation first turns
# contractions such as "there's" into "theres", which no longer matches the
# stop-word list and ends up in the frequency table.
#
# tm_map() on the corpus built here warns "transformation drops documents" at
# every step, as seen in the original run of this script.
build_word_freq <- function(descriptions) {
  docs <- Corpus(VectorSource(descriptions))
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeWords, stopwords("english"))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, stripWhitespace)

  # Terms are rows of the term-document matrix, so the row sums are the total
  # number of occurrences of each term.
  term_counts <- rowSums(as.matrix(TermDocumentMatrix(docs)))
  term_counts <- sort(term_counts, decreasing = TRUE)

  data.frame(word = names(term_counts), freq = term_counts)
}

# One frequency table per popular variety.
df_pinot     <- build_word_freq(pinot_noir$description)
df_chardonnay <- build_word_freq(chardonnay$description)
df_cabernet  <- build_word_freq(cabernet$description)
df_rblend    <- build_word_freq(red_blend$description)
df_sauvignon <- build_word_freq(sauvignon_blanc$description)
df_riesling  <- build_word_freq(riesling$description)
df_bordeaux  <- build_word_freq(bordeaux$description)
df_syrah     <- build_word_freq(syrah$description)
df_merlot    <- build_word_freq(merlot$description)
df_zinfandel <- build_word_freq(zinfandel$description)


# Word cloud preparation.

# Words that should not appear in the clouds because they are too obvious or
# too generic (verbs, variety names, filler words).
frequent_words <- c(
  "wine", "drink", "feel", "champagne", "-", "la", "flavors", "now", "pinot",
  "finish", "bottle", "just", "years", "still", "also", "will", "gives",
  "mouth", "feel", "like", "give", "end", "one", "almost", "much", "many",
  "along", "chardonnay", "cabernet", "|", "shows", "zin", "alcohol", "finish",
  "like", "high", "now", "shows", "will", "big", "well", "almost", "just",
  "flavors", "merlot", "now", "bit", "noir", "red", "aromas", "yet", "theres",
  "next", "vineyard", "notes", "offers", "wines", "new", "good", "develop",
  "way", "blend", "sauvignon", "syrah", "opens", "delivers", "slightly",
  "made", "make", "ready", "riesling", "finishes", "bordeaux", "merlot",
  "zinfandel", "-", "bordeauxstyle", "little", "nose", "thats", "structure",
  "tastes", "enough", "character", "best", "long", "touch", "franc", "lots"
)

# Drop those words from every variety's frequency table.
df_pinot      <- subset(df_pinot, !(word %in% frequent_words))
df_chardonnay <- subset(df_chardonnay, !(word %in% frequent_words))
df_cabernet   <- subset(df_cabernet, !(word %in% frequent_words))
df_rblend     <- subset(df_rblend, !(word %in% frequent_words))
df_sauvignon  <- subset(df_sauvignon, !(word %in% frequent_words))
df_riesling   <- subset(df_riesling, !(word %in% frequent_words))
df_bordeaux   <- subset(df_bordeaux, !(word %in% frequent_words))
df_syrah      <- subset(df_syrah, !(word %in% frequent_words))
df_merlot     <- subset(df_merlot, !(word %in% frequent_words))
df_zinfandel  <- subset(df_zinfandel, !(word %in% frequent_words))

# Word cloud of the 100 most frequent words used to describe Pinot Noir.
# set.seed(1234)  for reproducibility
wordcloud(
  words = df_pinot$word,
  freq = df_pinot$freq,
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = viridis_pal(option = "plasma")(100)
)

Word clouds

# One letter cloud per variety, shaped like the variety's initials.
letterCloud(data = df_pinot, word = "PN", wordSize = 1, size = 2, color = "yellow")
letterCloud(data = df_chardonnay, word = "CH", wordSize = 1, size = 2, color = "pink")
letterCloud(data = df_cabernet, word = "CS", wordSize = 1, size = 2, color = "maroon")
letterCloud(data = df_rblend, word = "RB", wordSize = 1, size = 2, color = "orange")
letterCloud(data = df_sauvignon, word = "SB", wordSize = 1, size = 2, color = "dodgerblue")
letterCloud(data = df_riesling, word = "R", wordSize = 1, size = 2, color = "green")
letterCloud(data = df_bordeaux, word = "BX", wordSize = 1, size = 2, color = "tomato")
letterCloud(data = df_syrah, word = "SH", wordSize = 1, size = 2, color = "skyblue")
letterCloud(data = df_merlot, word = "M", wordSize = 1, size = 2, color = "palegreen")
letterCloud(data = df_zinfandel, word = "ZN", wordSize = 1, size = 2, color = "red")

Creating interactive maps that answer three questions: 1) Where does the best wine come from? 2) Where does the most expensive wine come from? 3) Is there a correlation between quality and price?

# Mean wine score per country, rounded to 2 decimals for display.
mean_score_by_country <- aggregate(wine$points, by = list(wine$country), mean)
colnames(mean_score_by_country) <- c("country", "points")
mean_score_by_country$points <- round(mean_score_by_country$points, 2)

# Map geometry shipped with highcharter. Only this packaged dataset is loaded
# with data(): `mean_score_by_country` is an ordinary data frame built above,
# and calling data() on it only produced a "data set not found" warning.
data(worldgeojson, package = "highcharter")

# Show tooltip values with 2 decimals in every highcharter chart.
options(highcharter.theme = hc_theme_smpl(tooltip = list(valueDecimals = 2)))

# Interactive world map coloured by the mean wine score of each country.
# NOTE(review): the join matches the map's `name` against `country` exactly;
# any country spelled differently in `wine$country` than in worldgeojson will
# be left blank on the map - verify the names line up.
best_wine <- highchart() %>%
  hc_add_series_map(
    worldgeojson,
    mean_score_by_country,
    value = "points",
    joinBy = c("name", "country"),
    name = "Mean score"
  ) %>%
  hc_colorAxis(min = 80, max = 92, minColor = "pink", maxColor = "#B2001B") %>%
  hc_title(text = "World Map") %>%
  hc_subtitle(text = "Best Wine in the World")

best_wine
# Mean wine price per country, rounded to 2 decimals for display.
mean_price_by_country <- setNames(
  aggregate(wine$price, by = list(wine$country), mean),
  c("country", "price")
)
mean_price_by_country$price <- round(mean_price_by_country$price, 2)

# Same interactive world map as for the scores, here coloured by the mean
# price of wine in each country. Built step by step instead of in one chain.
expensive_wine <- hc_add_series_map(
  highchart(),
  worldgeojson,
  mean_price_by_country,
  value = "price",
  joinBy = c("name", "country"),
  name = "Mean price"
)
expensive_wine <- hc_colorAxis(
  expensive_wine,
  min = 10, max = 65, minColor = "yellow", maxColor = "red"
)
expensive_wine <- hc_title(expensive_wine, text = "World Map")
expensive_wine <- hc_subtitle(
  expensive_wine,
  text = "Most Expensive Wine in the World"
)

expensive_wine